The aim of this project was to build numerous classification models using single-cell image mass cytometry (IMC), gene expression, and clinical data to predict the survival status of breast cancer patients. With advancements in high-throughput technologies such as IMC, I wanted to investigate whether datasets built on this new technology have more predictive power than more conventional approaches. Thus, a comprehensive analysis using cross-validation and model stability was conducted to determine the dataset with the most predictive power.
model_cv_feature = function(data, k=10, ncv = 5, top = 10,fs = FALSE, categorical_exist = TRUE, seed =2020, w = FALSE, exv = NULL){
set.seed(seed)
k = k
ncv = ncv
data$tag = cvTools::cvFolds(nrow(data), ncv)$which
a_logistic= c()
a_logistic_full = c()
a_rf= c()
a_knn = c()
a_xgb= c()
a_logistic1= c()
a_logistic_full1 = c()
a_rf1= c()
a_knn1 = c()
a_xgb1= c()
a_steplog_kap= c()
a_log_kap = c()
a_rf_kap= c()
a_knn_kap = c()
a_xgb_kap= c()
a_steplog_kap1 = c()
a_log_kap1 = c()
a_rf_kap1= c()
a_knn_kap1 = c()
a_xgb_kap1= c()
a_steplog_balanced = c()
a_log_balanced = c()
a_rf_balanced = c()
a_knn_balanced = c()
a_xgb_balanced= c()
a_steplog_balanced1 = c()
a_log_balanced1 = c()
a_rf_balanced1= c()
a_knn_balanced1 = c()
a_xgb_balanced1= c()
for (j in 1:k){
data = data %>% mutate(tag = sample(tag))
if (j == 1){
if (categorical_exist == TRUE){
categorical_cols = data %>% select_if(negate(is.numeric)) %>% select(-sample_id) %>% colnames()
original_non_categorical = setdiff(data %>% colnames(), categorical_cols)
nn = data %>% mlr::createDummyFeatures(cols =categorical_cols) %>%
select(-original_non_categorical) %>% colnames()
data = data %>% mlr::createDummyFeatures(cols =categorical_cols)
d2 = data
}else{
nn = NULL
d2 = data
}
}
for (i in 1:ncv){
if (fs == TRUE){
test = data %>% filter(tag == i)
rownames(test) = test$sample_id
train = data %>% filter(tag != i)
rownames(train) = train$sample_id
pheno = train %>% select(event)
express = train %>% select(-sample_id,-event, -tag,-exv) %>% t()
set = ExpressionSet(assayData = express, phenoData = AnnotatedDataFrame(pheno))
design = model.matrix(~event, pheno)
fit = lmFit(set[, rownames(design)], design, method = "ls")
efit = eBayes(fit, trend = TRUE)
dt_fdr <- decideTests(efit)
top10 <- topTable(efit, n = top) %>% rownames()
f_test = test %>% select(top10, event, exv)
f_train = train %>% select(top10, event,exv)
d2 = data %>% select(top10, event, tag,exv)
}else{
f_test = data %>% filter(tag == i)%>% select(-tag, -sample_id)
f_train = data %>% filter(tag != i) %>% select(-tag, -sample_id)
d2 = data %>% select(-sample_id)
}
if (w == FALSE){
w1 = NULL
}else{
w1 = ifelse(f_train$event == 0,
(1/table(f_train$event)[1]) * 0.5,
(1/table(f_train$event)[2]) * 0.5)
}
#logistic
logistic = glm(event~., data = f_train, family = binomial, weights =w1)
step_logistic = step(logistic, trace = FALSE)
preds = as.factor(round(predict(step_logistic,f_test, type = "response")))
truth = factor(f_test$event, levels=c(0,1))
a_logistic[i] = caret::confusionMatrix(data = preds, reference = truth, positive = "1")$overall[1]
a_steplog_kap[i] = caret::confusionMatrix(data = preds, reference = truth, positive = "1")$overall[2]
a_steplog_balanced[i] = caret::confusionMatrix(data = preds, reference = truth, positive = "1")$byClass[11]
#full logistic
preds = as.factor(round(predict(logistic,f_test, type = "response")))
truth = factor(f_test$event, levels=c(0,1))
a_logistic_full[i] = caret::confusionMatrix(data = preds, reference = truth, positive = "1")$overall[1]
a_log_kap[i] =caret::confusionMatrix(data = preds, reference = truth, positive = "1")$overall[2]
a_log_balanced[i] = caret::confusionMatrix(data = preds, reference = truth, positive = "1")$byClass[11]
# Random Forest
rf = rpart(factor(event) ~ ., data = f_train, weights = w1)
preds = as.factor(predict(rf,f_test, type = "class"))
truth = factor(f_test$event, levels=c(0,1))
a_rf[i] = caret::confusionMatrix(data = preds, reference = truth, positive = "1")$overall[1]
a_rf_kap[i] = caret::confusionMatrix(data = preds, reference = truth, positive = "1")$overall[2]
a_rf_balanced[i] = caret::confusionMatrix(data = preds, reference = truth, positive = "1")$byClass[11]
# KNN
train_scaled = d2 %>% as.data.frame() %>% mutate(y = factor(event), tag = factor(tag)) %>% mutate(across(where(is.numeric),.fns = scale)) %>% filter(tag != paste0(i)) %>% select(-tag,-nn)
test_scaled = d2 %>% as.data.frame() %>% mutate(y = factor(event), tag = factor(tag)) %>% mutate(across(where(is.numeric),.fns = scale)) %>% filter(tag == paste0(i)) %>% select(-tag,-nn)
X_train = train_scaled %>% select(-event,-y) %>% select_if(is.numeric)
y_train = train_scaled %>% select(y) %>% pull()
n = length(y_train)
X_test = test_scaled %>% select(-event,-y) %>% select_if(is.numeric)
y_test = test_scaled %>% select(y) %>% pull()
preds = class::knn(train = X_train, test = X_test, cl = y_train, k = 5)
truth = y_test %>% factor(levels=c(0,1))
tab <- table(preds,truth)
a_knn[i] = sum(diag(tab)/(sum(rowSums(tab))))
a_knn_kap[i] = caret::confusionMatrix(data = preds, reference = truth, positive = "1")$overall[2]
a_knn_balanced[i] = caret::confusionMatrix(data = preds, reference = truth, positive = "1")$byClass[11]
# XGBOOST
xgb <- xgboost(data = data.matrix(f_train %>% select(-event)),
label = f_train %>% select(event) %>% as.matrix(),
max.depth = 2, eta = 1, nthread = 2, nrounds = 2,verbose = 0, weight =w1)
preds = predict(xgb,data.matrix(f_test%>% select(-event)))
preds = as.numeric(preds > 0.5) %>% factor(levels=c(0,1))
truth = factor(f_test$event,levels=c(0,1))
tab <- table(preds,truth)
a_xgb[i] = sum(diag(tab)/(sum(rowSums(tab))))
a_xgb_kap[i] =caret::confusionMatrix(data = preds, reference = truth, positive = "1")$overall[2]
a_xgb_balanced[i] = caret::confusionMatrix(data = preds, reference = truth, positive = "1")$byClass[11]
}
a_logistic1[j] = mean(a_logistic, na.rm = TRUE)
a_logistic_full1[j] = mean(a_logistic_full,na.rm = TRUE)
a_rf1[j]= mean(a_rf, na.rm = TRUE)
a_knn1[j] = mean(a_knn, na.rm = TRUE)
a_xgb1[j]= mean(a_xgb, na.rm = TRUE)
a_steplog_balanced1[j] = mean(a_steplog_balanced,na.rm = TRUE)
a_log_balanced1[j] = mean(a_log_balanced, na.rm = TRUE)
a_rf_balanced1[j]= mean(a_rf_balanced, na.rm = TRUE)
a_knn_balanced1[j] = mean(a_knn_balanced, na.rm = TRUE)
a_xgb_balanced1[j]= mean(a_xgb_balanced,na.rm = TRUE)
a_steplog_kap1[j] = mean(a_steplog_kap, na.rm = TRUE)
a_log_kap1[j] = a_log_kap%>% mean(na.rm = TRUE)
a_rf_kap1[j]= a_rf_kap%>% mean(na.rm = TRUE)
a_knn_kap1[j] =a_knn_kap%>% mean(na.rm = TRUE)
a_xgb_kap1[j]=a_xgb_kap %>% mean(na.rm = TRUE)
}
means = cbind(step_log = a_logistic1 %>% mean(na.rm = TRUE),
full_log = a_logistic_full1 %>% mean(na.rm = TRUE),
rf = a_rf1 %>% mean(na.rm = TRUE),
knn = a_knn1 %>% mean(na.rm = TRUE),
xgb = a_xgb1 %>% mean(na.rm = TRUE)) %>% as.data.frame()
kappas = cbind(step_log = a_steplog_kap1,
full_log = a_log_kap1 ,
rf = a_rf_kap1 ,
knn = a_knn_kap1 ,
xgb = a_xgb_kap1) %>% as.data.frame()
accuracies = cbind(step_log = a_logistic1,
full_log = a_logistic_full1,
rf = a_rf1,
knn = a_knn1,
xgb = a_xgb1) %>% as.data.frame()
balanced_accuracies = cbind(step_log = a_steplog_balanced1,
full_log = a_log_balanced1,
rf = a_rf_balanced1,
knn = a_knn_balanced1,
xgb = a_xgb_balanced1) %>% as.data.frame()
return(list(means, accuracies, kappas, balanced_accuracies))
}gt_create <- function(data){
data1= data[1] %>% as.data.frame()
colnames(data1) = gsub("_"," ",colnames(data1))%>% str_to_title()
data2 = data[3] %>% as.data.frame() %>% mutate_all(mean,na.rm = TRUE) %>% dplyr::slice(1)
colnames(data2) = gsub("_"," ",colnames(data2))%>% str_to_title()
data3_ba = data[4] %>% as.data.frame() %>% mutate_all(mean,na.rm = TRUE) %>% dplyr::slice(1)
colnames(data3_ba) = gsub("_"," ",colnames(data3_ba))%>% str_to_title()
best1 = data1 %>% t() %>% as.data.frame() %>% filter(V1 == max(V1)) %>% t() %>% colnames()
worst1 = data1 %>% t() %>% as.data.frame() %>% filter(V1 == min(V1)) %>% t() %>% colnames()
best2 = data2 %>% t() %>% as.data.frame() %>% filter(V1 == max(V1)) %>% t() %>% colnames()
worst2 = data2 %>% t() %>% as.data.frame() %>% filter(V1 == min(V1)) %>% t() %>% colnames()
best3_ba = data3_ba %>% t() %>% as.data.frame() %>% filter(V1 == max(V1)) %>% t() %>% colnames()
worst3_ba = data3_ba %>% t() %>% as.data.frame() %>% filter(V1 == min(V1)) %>% t() %>% colnames()
rbind(Accuracy = data1,Kappa =data2, Balanced_Accuracy = data3_ba) %>% mutate(rows = c("Accuracy","Kappa", "Balanced Accuracy")) %>% gt::gt(rowname_col = "rows") %>%
tab_style(
style = list(cell_text(weight = "bold"), cell_fill("lightcyan")),
locations = cells_column_labels(columns = vars("Step Log","Full Log","Rf","Knn","Xgb" ))
)%>%
tab_style(
style = list(
cell_fill(color = "green"),
cell_text(weight = "bold")
),
locations = cells_body(
columns = best1[1], "Accuracy")
)%>%
tab_style(
style = list(
cell_fill(color = "red"),
cell_text(weight = "bold")
),
locations = cells_body(
columns = worst1[1], rows = "Accuracy")
)%>%
tab_style(
style = list(
cell_fill(color = "green"),
cell_text(weight = "bold")
),
locations = cells_body(
columns = best2[1], "Kappa")
)%>%
tab_style(
style = list(
cell_fill(color = "red"),
cell_text(weight = "bold")
),
locations = cells_body(
columns = worst2[1], rows = "Kappa")
)%>%
tab_style(
style = list(
cell_fill(color = "green"),
cell_text(weight = "bold")
),
locations = cells_body(
columns = best3_ba[1], "Balanced Accuracy")
)%>%
tab_style(
style = list(
cell_fill(color = "red"),
cell_text(weight = "bold")
),
locations = cells_body(
columns = worst3_ba[1], rows = "Balanced Accuracy")
)
}gt_create_multi <- function(data, names){
datas = data.frame()
best_a = c()
worst_a = c()
best_k = c()
worst_k = c()
best_ba = c()
worst_ba = c()
for (i in 1:length(data)){
da = data[[i]][[1]]
colnames(da) = gsub("_"," ",colnames(da))%>% str_to_title()
dk = data[[i]][[3]] %>% mutate_all(mean,na.rm = TRUE) %>% dplyr::slice(1)
colnames(dk) = gsub("_"," ",colnames(dk))%>% str_to_title()
dba = data[[i]][[4]] %>% mutate_all(mean,na.rm = TRUE) %>% dplyr::slice(1)
colnames(dba) = gsub("_"," ",colnames(dba))%>% str_to_title()
best_a[i] = da %>% t() %>% as.data.frame() %>% filter(V1 == max(V1)) %>% t() %>% colnames()
worst_a[i] = da %>% t() %>% as.data.frame() %>% filter(V1 == min(V1)) %>% t() %>% colnames()
bk = dk %>% t() %>% as.data.frame() %>% filter(V1 == max(V1)) %>% t() %>% colnames()
wk = dk %>% t() %>% as.data.frame() %>% filter(V1 == min(V1)) %>% t() %>% colnames()
if (is.null(bk)){
best_k[i]=dk %>% t() %>% as.data.frame() %>% filter(!is.na(V1)) %>% filter(V1 == max(V1)) %>% rownames()
}else{
best_k[i] = bk
}
if ((is.null(wk))){
worst_k[i]= dk %>% t() %>% as.data.frame() %>% filter(is.na(V1)) %>% rownames()
}else{
worst_k[i] = wk
}
best_ba[i] = dba %>% t() %>% as.data.frame() %>% filter(V1 == max(V1)) %>% t() %>% colnames()
worst_ba[i] = dba %>% t() %>% as.data.frame() %>% filter(V1 == min(V1)) %>% t() %>% colnames()
da$type = names[i]
da$metric = paste("Accuracy",i)
dk$type = names[i]
dk$metric = paste("Kappa",i)
dk$type = names[i]
dk$metric = paste("Kappa",i)
dba$type = names[i]
dba$metric = paste("Balanced Accuracy",i)
datas = rbind(datas, da, dk, dba)
}
colnames(datas) = gsub("_"," ",colnames(datas))%>% str_to_title()
base = datas %>%group_by(Type) %>% gt(groupname_col = "Type", rowname_col = "Metric") %>%
tab_style(
style = list(cell_text(weight = "bold"), cell_fill("lightcyan")),
locations = cells_column_labels(columns = vars("Step Log","Full Log","Rf","Knn","Xgb" ))
)
for (i in 1:length(data)){
base = base %>%
tab_style(
style = list(
cell_fill(color = "green"),
cell_text(weight = "bold")
),
locations = cells_body( columns = best_a[i], rows = Metric == paste("Accuracy",i) )
) %>%
tab_style(
style = list(
cell_fill(color = "red"),
cell_text(weight = "bold")
),
locations =cells_body(columns = worst_a[i], rows = Metric == paste("Accuracy",i) )
) %>%
tab_style(
style = list(
cell_fill(color = "green"),
cell_text(weight = "bold")
),
locations =cells_body(columns = best_k[i], rows = Metric == paste("Kappa",i) )
) %>%
tab_style(
style = list(
cell_fill(color = "red"),
cell_text(weight = "bold")
),
locations =cells_body(columns = worst_k[i], rows = Metric == paste("Kappa",i) )
)%>%
tab_style(
style = list(
cell_fill(color = "green"),
cell_text(weight = "bold")
),
locations =cells_body(columns = best_ba[i], rows = Metric == paste("Balanced Accuracy",i) )
) %>%
tab_style(
style = list(
cell_fill(color = "red"),
cell_text(weight = "bold")
),
locations =cells_body(columns = worst_ba[i], rows = Metric == paste("Balanced Accuracy",i) )
)
}
base
}gt_best_in_data = function(model, ns){
model_best = data.frame()
for (i in 1:length(model)){
bam = model[[i]][[1]] %>% t() %>% as.data.frame() %>% filter(V1 == max(V1)) %>% rownames()
bkm = model[[i]][[3]] %>% mutate_all(mean, na.rm = TRUE) %>%dplyr::slice(1) %>% t() %>% as.data.frame() %>% filter(V1 == max(V1)) %>% rownames()
bbam = model[[i]][[4]] %>% mutate_all(mean, na.rm = TRUE) %>%dplyr::slice(1) %>% t() %>% as.data.frame() %>% filter(V1 == max(V1)) %>% rownames()
best_models = cbind(Accuracy = model[[i]][[1]][bam] %>% pull(), Kappa = model[[i]][[3]] %>% mutate_all(mean, na.rm = TRUE) %>%dplyr::slice(1) %>% pull(bkm), Balanced_Accuracy = model[[i]][[4]] %>% mutate_all(mean, na.rm = TRUE) %>%dplyr::slice(1) %>% pull(bbam)) %>% t() %>% as.data.frame()
best_models$type = c("Accuracy", "Kappa", "Balanced Accuracy")
best_models$dataset = ns[i]
best_models$model = gsub("_"," ",c(bam, bkm,bbam)) %>% str_to_title()
model_best = rbind(model_best,best_models)
}
model_best = model_best %>% rename(Score = V1)
colnames(model_best) = colnames(model_best) %>% str_to_title()
ba = model_best %>% group_by(Type) %>% summarise(max(Score)) %>% filter(Type == "Accuracy") %>% pull(`max(Score)`)
ba_m = model_best %>% filter(Score == ba)
bk = model_best %>% group_by(Type) %>% summarise(max(Score)) %>% filter(Type == "Kappa") %>% pull(`max(Score)`)
bk_m = model_best %>% filter(Score == bk)
bbm = model_best %>% group_by(Type) %>% summarise(max(Score)) %>% filter(Type == "Balanced Accuracy") %>% pull(`max(Score)`)
bbm_m = model_best %>% filter(Score == bbm)
wa = model_best %>% group_by(Type) %>% summarise(min(Score)) %>% filter(Type == "Accuracy") %>% pull(`min(Score)`)
wa_m = model_best %>% filter(Score == wa)
wk = model_best %>% group_by(Type) %>% summarise(min(Score)) %>% filter(Type == "Kappa") %>% pull(`min(Score)`)
wk_m = model_best %>% filter(Score == wk)
wbm = model_best %>% group_by(Type) %>% summarise(min(Score)) %>% filter(Type == "Balanced Accuracy") %>% pull(`min(Score)`)
wbm_m = model_best %>% filter(Score == wbm)
g1 = model_best %>% select(Dataset, Type, Score, Model) %>% dplyr::group_by(Type) %>% gt() %>%
tab_style(
style = list(cell_text(weight = "bold"), cell_fill("lightcyan")),
locations = list(cells_column_labels(columns = c("Dataset", "Score", "Model")))
) %>%
tab_style(style = list(cell_text(weight = "bold")),
locations = cells_row_groups(c("Accuracy","Kappa", "Balanced Accuracy")
)
)
g1 = g1 %>% tab_style(
style = list(
cell_fill(color = "green"),
cell_text(weight = "bold")
),locations =cells_body(rows = Score == ba_m[1] %>% pull())
) %>% tab_style(
style = list(
cell_fill(color = "green"),
cell_text(weight = "bold")
),locations =cells_body(rows = Score == bk_m[1] %>% pull() )
)%>% tab_style(
style = list(
cell_fill(color = "green"),
cell_text(weight = "bold")
),locations =cells_body(rows = Score == bbm_m[1] %>% pull())
)
g1 %>% tab_style(
style = list(
cell_fill(color = "red"),
cell_text(weight = "bold")
),locations =cells_body(rows = Score == wa_m[1] %>% pull())
) %>% tab_style(
style = list(
cell_fill(color = "red"),
cell_text(weight = "bold")
),locations =cells_body(rows = Score == wk_m[1] %>% pull() )
)%>% tab_style(
style = list(
cell_fill(color = "red"),
cell_text(weight = "bold")
),locations =cells_body(rows = Score == wbm_m[1] %>% pull())
)
}
The single cell data only has a relatively small subset of all METABRIC patients. Thus, I will subset all the other datasets as well to only contain the METABRIC patients who are also in the single cell dataset. This reduces the sample size (from approximately 1900 to 355 after data cleaning) available to train and test our models for these other datasets. However, this choice was made to allow for a controlled comparison between the models we produce, such that each is trained and tested on the same set of patients. This enables us to reduce the impact of individual variability on model performance and thus determine which model performs best given the same set of patients.
clinical_data = load("/dskh/nobackup/biostat/datasets/spatial/metabric/idr/130510_illu_phenoData.Rdata")
clinical_data = illu.pData %>% mutate(sample_id = gsub(".","-", sample_id,fixed = TRUE))
sc_data = read_csv("/dskh/nobackup/biostat/datasets/spatial/metabric/idr/sc_data.csv")The clinical data contains information related to the age, subtype of cancer, histology, treatment a patient has received as well as other variables.
There are 51 patients without a clear indication of whether they survived (missing values for their outcome). Thus, these patients will be excluded from the analysis.
clinical_data %>% filter(sample_id %in% (sc_data$metabricId %>% unique())) %>% select(-pgr.status, -time, -dataset) %>% select(event) %>% is.na() %>% sum()## [1] 51
clinical_data_processed = clinical_data %>% filter(sample_id %in% (sc_data$metabricId %>% unique())) %>% select(-pgr.status, -time, -dataset) %>% tidyr::drop_na(event)There are some columns with a few missing values. For the purposes of minimising the amount of patients dropped, I will not use these columns in building our model.
Figure 1: Missing values visual in the clinical dataset
The Image Mass Cytometry (IMC) data contains information on images of cores of breast cancer tissue, with single cells identified on this tissue and labelled according to 1 of 24 cell types. The location of each cell on a core was also marked with an x and y coordinate. Before feeding this dataset into my models, I transformed it into 2 different versions. The first counted what proportion of each cell type was in a patient’s breast cancer tissue core. The second quantified the level of spatial pairwise interaction between every combination of two cell types in each patient using the spicyR package.
sc_props = sc_data %>% group_by(metabricId, core_id, ImageNumber,description) %>% summarise(n = n()) %>% mutate(freq = n / sum(n)) %>% ungroup() %>% group_by(metabricId, description, core_id) %>% summarise(avg = mean(freq))
sc_props_sliced = sc_props %>% group_by(metabricId, description) %>% dplyr::slice(1) %>% ungroup() %>% select(-core_id) %>% spread(description, avg) %>% mutate_all(funs(ifelse(is.na(.), 0, .))) %>% clean_names() %>% rename(sample_id = metabric_id)
sc_proportions_processed = inner_join(sc_props_sliced, clinical_data) %>% select(sc_props_sliced %>% colnames(), event) %>% na.omit() Below is a glimpse of the output from the spicyR package.
sc_data_copy = sc_data
colnames(sc_data_copy)[6:44] = paste0("Intensity_Mean_",colnames(sc_data_copy)[6:44])
sc_data_copy = sc_data_copy %>% dplyr:: rename(x = Location_Center_X, y = Location_Center_Y, cellType = pg_cluster, imageID = ImageNumber) %>% mutate(cellType = cellType) %>% as.data.frame()
pheno = sc_data_copy %>% rename(sample_id = metabricId) %>% select(sample_id, imageID) %>% unique() %>% merge(y = clinical_data, by = "sample_id", all.x = TRUE) %>% select(sample_id, imageID, event) %>% as.data.frame() %>% unique() %>% rename(subject = sample_id, condition = event)
cellExp = SegmentedCells(sc_data_copy,
cellTypeString = 'cellType',
intensityString = 'Intensity_Mean_')
cellType(cellExp) <- sc_data_copy$cellType
imagePheno(cellExp) <- pheno
cellSum <- cellSummary(cellExp)
head(cellSum)## DataFrame with 6 rows and 6 columns
## imageID cellID imageCellID x y cellType
## <factor> <character> <character> <numeric> <numeric> <factor>
## 1 527 cell_1 cell_1 161.833 6.0000 24
## 2 527 cell_2 cell_2 177.304 15.5391 24
## 3 527 cell_3 cell_3 293.519 19.8861 24
## 4 527 cell_4 cell_4 165.043 22.2101 20
## 5 527 cell_5 cell_5 108.881 28.5238 24
## 6 527 cell_6 cell_6 268.237 28.6619 24
# The spicy() fit is expensive, so it was run once and cached to disk; the
# commented lines record how spicyTest was produced.
# spicyTest <- spicy(cellExp, subject = "subject", condition = "condition")
#save(spicyTest,file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/spicyTest.Rdata")
load("/dskh/nobackup/biostat/datasets/spatial/metabric/idr/spicyTest.Rdata")
# Collect the pairwise association scores (one list element per cell-type
# pair, one value per image) into a data frame: rows = pairs, cols = images.
# NOTE(review): rbind() PREPENDS each new row, so rows end up in reverse
# list order, and growing a data frame in a loop is O(n^2); because the
# pair names travel with the rows and columns are addressed by name
# downstream, the reversal appears harmless, but this deserves tidying.
# Prefer seq_along() over 1:length() as well.
associations = data.frame()
for (i in 1:length(spicyTest$pairwiseAssoc)){
associations = spicyTest$pairwiseAssoc[i] %>% as.data.frame() %>% t() %>% rbind(associations)
}
# Label the columns with their image IDs, flip to images-as-rows, and strip
# the "X" prefix R adds when numeric IDs become column names.
# NOTE(review): gsub("X", "") removes EVERY "X", not just a leading one —
# safe only while image IDs are purely numeric.
colnames(associations) = pheno$imageID
associations = associations %>% t() %>% as.data.frame()
colnames(associations) = gsub("X", "", colnames(associations))
# One row per patient: join the outcome via pheno, drop images without an
# outcome, zero-fill missing associations, and keep a single image per
# patient. (The trailing sentence was fused onto this line by the report
# flattening; it is report prose, not code.)
sc_spatial_interactions = associations %>% mutate(imageID = associations %>% rownames() %>% as.numeric()) %>% inner_join(pheno) %>% tidyr::drop_na(condition) %>%
mutate_all(funs(ifelse(is.na(.), 0, .))) %>% rename(sample_id = subject,event = condition) %>%group_by(sample_id) %>% dplyr::slice(1) %>% ungroup() %>% select(-imageID)Spatial Interactions between cell types were calculated and p-value histograms of these interactions between clusters were plotted .
Figure 2: Unadjusted p-value histogram of interactions between clusters.
Adjusted p-value histogram of interactions between clusters to correct for multiple testing. There are no significant interactions at a significance level of \(\alpha = 0.05\). This suggests that looking at spatial associations between cell types may be ineffective in predicting the survival outcome of patients. However, we will still proceed with fitting a model using this spatial interactions data.
Figure 3: Bonferroni adjusted p-value histogram of interactions between clusters.
From the correlation plot, there don’t seem to be many strong interaction associations in either the positive or negative direction.
Figure 4: Correlation plot of the level of interactions between clusters.
The Gene Expression data measured the expression of 35000 genes for each patient. The survival status for each patient was added by matching patient id’s to the clinical dataset and extracting this variable.
# Load the Illumina expression matrix and keep only patients that also
# appear in the IMC single-cell data; the "." that R substituted into the
# rownames is converted back to the "-" used by METABRIC sample IDs.
load("/dskh/nobackup/biostat/datasets/spatial/metabric/idr/130510_illu_expr_matrix.Rdata")
ge_data= illu.data %>% as.data.frame()%>% mutate(sample_id = gsub(".","-",rownames(illu.data),fixed = TRUE)) %>% filter(sample_id %in% (sc_data$metabricId %>% unique()))
# Attach the survival outcome and drop patients without one. (The trailing
# paragraph was fused onto this line by the report flattening.)
ge_data_processed = inner_join(ge_data, clinical_data) %>% select(ge_data %>% colnames, event) %>% tidyr::drop_na(event)In the dataset used train and test models there is an imbalance in the patient outcome in breast cancer patients. With there being many more patients that died relative to those who survived. Thus, accuracy may not be an appropriate metric in measuring the performance of a model. Instead using a Kappa score or Balanced accuracy metric may be more suitable.
A kappa score/statistic measures whether the classifier built performs better than a classifier that predicts an outcome based on the frequency of that class. A kappa value < 0 indicates no agreement, 0–0.20 slight, 0.21–0.40 fair, 0.41–0.60 moderate, 0.61–0.80 substantial, and 0.81–1 almost perfect agreement (McHugh, 2012).
Balanced accuracy is calculated by taking the average of the proportion of correct predictions made for each class separately. Thus, unlike normal accuracy, the imbalanced class can have a much more substantial impact on the metric and will be lower if the model is performing poorly on the imbalanced class.
The models built on the clinical data had the best performance in terms of accuracy, balanced accuracy and kappa scores.
In all the models, accuracy and balanced accuracy are of a relatively disparate magnitude. This indicates that the model is over-fitting to the proportions of classes in the training data (i.e. predicts that most patients don’t survive a majority of the time due to this being the most frequent class).
In general the stepwise logistic models have the best accuracy across datatypes, while the random forest performs the worst. Depending on the dataset, the models with best kappa and balanced accuracy scores are the stepwise logistic, logistic and random forest models.
Within the models built on the clinical data, the one created through stepwise feature selection on a logistic model had the best average accuracy score of 0.79 and second best kappa and balanced accuracy score of 0.23 and 0.59 respectively.
This kappa score indicates that there is a fair amount of agreement between our model predictions and the actual outcome when accounting for the imbalance of survivors to non-survivors. Although there is a fair amount of agreement, this score likely means that the model is to some extent still predicting classes according to their frequency. That is, the model forecasts that a majority of patients don’t survive as they are more prominent in the training data and thus are being given a higher weight when predictions are being made.
The stepwise model also seems to have a similar level of variability to the other models for both accuracy and kappa. However, from the boxplots, the variability for the kappa scores seem to be higher than for accuracy in the clinical models.
# K-fold CV on the clinical models was run once and cached; the commented
# line records the call that produced clinical_results.
# clinical_results = model_cv_feature(clinical_data_processed, k=10, ncv = 5, top = 10,fs = FALSE, categorical_exist = TRUE, seed =2020)
#
# Boxplots of the per-fold metrics for each model type; results list
# layout: [2] accuracy, [3] kappa, [4] balanced accuracy.
# NOTE(review): the "(ggplot(" opening that the lines below continue was
# fused into the save() comment by the report flattening — split it back
# onto its own line for this chunk to run.
# save(clinical_results,file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/clinical_results.Rdata")(ggplot(bind_rows(clinical_results[2] %>% as.data.frame() %>% gather() %>% mutate(df="Accuracy"), clinical_results[3] %>% as.data.frame() %>% gather() %>% mutate(df="Kappa"), clinical_results[4] %>% as.data.frame() %>% gather() %>% mutate(df="Balanced Accuracy") ),
aes(x = key,y = value,fill = key)) +
geom_boxplot() +
facet_wrap(~ df, scales="free") + labs(x = "", y = "")) %>% ggplotly()Figure 5: Boxplots of K-fold CV on clinical models
gt_create(clinical_results) %>% tab_source_note("Table 1: Average performance from K-fold CV on clinical models")| Step Log | Full Log | Rf | Knn | Xgb | |
|---|---|---|---|---|---|
| Accuracy | 0.7822535 | 0.7729577 | 0.7611268 | 0.7692958 | 0.7670423 |
| Kappa | 0.2281237 | 0.2315812 | 0.1836598 | 0.1925367 | 0.1753885 |
| Balanced Accuracy | 0.5943220 | 0.6007525 | 0.5800777 | 0.5809230 | 0.5765252 |
| Table 1: Average performance from K-fold CV on clinical models | |||||
The stepwise logistic model has the best accuracy for this dataset with a score of 0.76. This score is comparable to the accuracy from the worst performing model in the clinical dataset (random forest model). Furthermore, the best kappa and balanced accuracy scores from the random forest model built on single-cell (sc) proportions are worse than scores outputted from any of the models built from the clinical dataset. Thus, although a 0.1 kappa score indicates that there is a slight agreement between our model and predictions when accounting for class imbalances, this isn’t much better than using a model predicting outcomes according to class frequencies.
# Cached K-fold CV on the single-cell proportion models (no categorical
# predictors, so categorical_exist = FALSE).
# sc_proportions_processed_results = model_cv_feature(sc_proportions_processed, k=10, ncv = 5, top = 10,fs = FALSE, categorical_exist = FALSE, seed =2020)
#
# Same boxplot layout as Figure 5 ([2] accuracy, [3] kappa, [4] balanced
# accuracy). NOTE(review): the "(ggplot(" opening is fused into the save()
# comment by the report flattening, as in the clinical chunk.
# save(sc_proportions_processed_results,file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/sc_proportions_processed_results.Rdata")(ggplot(bind_rows(sc_proportions_processed_results[2] %>% as.data.frame() %>% gather() %>% mutate(df="Accuracy"), sc_proportions_processed_results[3] %>% as.data.frame() %>% gather() %>% mutate(df="Kappa"), sc_proportions_processed_results[4] %>% as.data.frame() %>% gather() %>% mutate(df="Balanced Accuracy")),
aes(x = key,y = value,fill = key)) +
geom_boxplot() +
facet_wrap(~ df, scales="free")+ labs(x = "", y = "")) %>% ggplotly()Figure 6: Boxplots of K-fold CV on single cell proportion models
gt_create(sc_proportions_processed_results) %>% tab_source_note("Table 2: Average performance from K-fold CV on single cell proportion models")| Step Log | Full Log | Rf | Knn | Xgb | |
|---|---|---|---|---|---|
| Accuracy | 0.7560563380 | 0.74591549 | 0.7180282 | 0.72760563 | 0.73577465 |
| Kappa | -0.0005815844 | 0.01557471 | 0.0939939 | 0.02881258 | 0.03615437 |
| Balanced Accuracy | 0.4997749415 | 0.50628401 | 0.5441350 | 0.51206119 | 0.51540678 |
| Table 2: Average performance from K-fold CV on single cell proportion models | |||||
The SC - Spatial Interactions data had many interactions which would be computationally difficult to build models on. Thus, I performed feature selection using the limma package to statistically choose the top 20 differentially expressed interactions between survivors and non-survivors. This process was incorporated into the CV loop to prevent data leakage from the testing to training folds and give a more accurate representation of how our models would perform on unseen data.
The models built on the interactions between single cells perform even worse than the sc proportions models for all metrics. A negative kappa score indicates that the model is performing worse than a model that just predicts survival outcome according to the frequency with which these classes appear.
# Cached K-fold CV on the spatial-interaction models; fs = TRUE turns on
# the in-fold limma feature selection (top 20 interactions).
# sc_spatial_interactions_results = model_cv_feature(sc_spatial_interactions, k=10, ncv = 5, top = 20,fs = TRUE, categorical_exist = FALSE, seed =2020)
#
# Same boxplot layout as Figure 5. NOTE(review): the "(ggplot(" opening is
# fused into the save() comment by the report flattening.
# save(sc_spatial_interactions_results,file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/sc_spatial_interactions_results.Rdata")(ggplot(bind_rows(sc_spatial_interactions_results[2] %>% as.data.frame() %>% gather() %>% mutate(df="Accuracy"), sc_spatial_interactions_results[3] %>% as.data.frame() %>% gather() %>% mutate(df="Kappa"), sc_spatial_interactions_results[4] %>% as.data.frame() %>% gather() %>% mutate(df="Balanced Accuracy") ),
aes(x = key,y = value,fill = key)) +
geom_boxplot() +
facet_wrap(~ df, scales="free")+ labs(x = "", y = "")) %>% ggplotly()Figure 7: Boxplots of K-fold CV on single cell spatial interaction models.
gt_create(sc_spatial_interactions_results) %>% tab_source_note("Table 3: Average performance from K-fold CV on single cell spatial interaction models")| Step Log | Full Log | Rf | Knn | Xgb | |
|---|---|---|---|---|---|
| Accuracy | 0.72422535 | 0.72309859 | 0.69492958 | 0.71267606 | 0.71436620 |
| Kappa | -0.01815017 | -0.01438837 | -0.02471922 | -0.02666036 | -0.03337648 |
| Balanced Accuracy | 0.49269775 | 0.49366447 | 0.48780533 | 0.48911381 | 0.48659130 |
| Table 3: Average performance from K-fold CV on single cell spatial interaction models | |||||
The Gene Expression data included 35000 genes for each patient which would be computationally difficult to build models on. Thus, I performed feature selection using the limma package to statistically choose the top 10 differentially expressed genes between survivors and non-survivors. This process was incorporated into the CV loop to prevent data leakage from the testing to training folds and give a more accurate representation of how our models would perform on unseen data.
The stepwise model built on the gene expression data has an almost identical performance to the models built on sc proportions. However, the accuracy, kappa and balanced accuracy scores are slightly lower.
# Cached K-fold CV on the gene-expression models; fs = TRUE selects the top
# 10 differentially expressed genes inside each fold.
# ge_results = model_cv_feature(ge_data_processed, k=10, ncv = 5, top = 10,fs = TRUE, categorical_exist = FALSE, seed =2020)
#
# Same boxplot layout as Figure 5. NOTE(review): the "(ggplot(" opening is
# fused into the save() comment by the report flattening.
# save(ge_results,file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/ge_results.Rdata")(ggplot(bind_rows(ge_results[2] %>% as.data.frame() %>% gather() %>% mutate(df="Accuracy"), ge_results[3] %>% as.data.frame() %>% gather() %>% mutate(df="Kappa"), ge_results[4] %>% as.data.frame() %>% gather() %>% mutate(df="Balanced Accuracy")),
aes(x = key,y = value,fill = key)) +
geom_boxplot() +
facet_wrap(~ df, scales="free")+ labs(x = "", y = "")) %>% ggplotly()Figure 8: Boxplots of K-fold CV on gene expression models
gt_create(ge_results) %>% tab_source_note("Table 4: Average performance from K-fold CV on gene expression models")| Step Log | Full Log | Rf | Knn | Xgb | |
|---|---|---|---|---|---|
| Accuracy | 0.73295775 | 0.73267606 | 0.70281690 | 0.71577465 | 0.71464789 |
| Kappa | 0.06483614 | 0.05816332 | 0.03641227 | 0.03829695 | 0.04099898 |
| Balanced Accuracy | 0.52855187 | 0.52590733 | 0.51631802 | 0.51663239 | 0.51818389 |
| Table 4: Average performance from K-fold CV on gene expression models | |||||
The best performing clinical models have a moderately good accuracy and kappa score (this metric investigates performance when there is a class imbalance i.e. significantly more patients who die than survive) when predicting patients who have been diagnosed with the Luminal A and HER2 cancer sub-type. This suggests that these models are performing fairly well when accounting for the class imbalance.
The XGB model predicting survival outcome in patients diagnosed with a Normal subtype of breast cancer has a moderately high accuracy. However, the kappa score is relatively low compared to the models built on the Luminal A and HER2 subtypes. Thus, this kappa score indicates that there is slight agreement between the model prediction and outcome when accounting for the class imbalance.
Models predicting the outcome in patients with a Luminal B or Basal subtype have a moderate accuracy but relatively poor kappa score. This indicates that these two models aren’t performing well when accounting for the class imbalance.
# Restrict the clinical data to patients present in the IMC data and drop
# the columns not used for subtype-level modelling.
clinical_subtypes = clinical_data %>%
  filter(sample_id %in% unique(sc_data$metabricId)) %>%
  select(-pgr.status, -time, -dataset) %>%
  tidyr::drop_na(event, subtype) %>%
  select(-er.status, -grade, -stage, -node.status)
# Helper: keep one breast-cancer subtype and drop the (now constant) label.
subtype_subset = function(label) {
  clinical_subtypes %>% filter(subtype == label) %>% select(-subtype)
}
lum_a = subtype_subset("Luminal A")
lum_b = subtype_subset("Luminal B")
normal= subtype_subset("Normal")
basal = subtype_subset("Basal")
her2 = subtype_subset("HER2")
# Collect the per-subtype data frames for the stability analysis below.
cl_st = list(lum_a, lum_b, normal, basal, her2)
# Subtype-stratified CV on the clinical data, run once and cached
# (ncv = 3 because the per-subtype sample sizes are small).
# lum_a_results = model_cv_feature(lum_a, k=10, ncv = 3, top = 10,fs = FALSE, categorical_exist = TRUE, seed =2020)
# lum_b_results = model_cv_feature(lum_b, k=10, ncv = 3, top = 10,fs = FALSE, categorical_exist = TRUE, seed =2020)
# normal_results = model_cv_feature(normal, k=10, ncv = 3, top = 10,fs = FALSE, categorical_exist = TRUE, seed =2020)
# basal_results = model_cv_feature(basal, k=10, ncv = 3, top = 10,fs = FALSE, categorical_exist = TRUE, seed =2020)
# her2_results = model_cv_feature(her2, k=10, ncv = 3, top = 10,fs = FALSE, categorical_exist = TRUE, seed =2020)
#
# clinical_subtype_results = list(lum_a_results,her2_results,normal_results,lum_b_results,basal_results)
#
# save(clinical_subtype_results, file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/clinical_subtype_results.Rdata")
# FIX: the load() call had been fused onto the commented save() line by the
# report flattening, which left it commented out even though the
# gt_create_multi() call below needs clinical_subtype_results.
load("/dskh/nobackup/biostat/datasets/spatial/metabric/idr/clinical_subtype_results.Rdata")
# Display names must follow the order results were collected into the list
# above: Luminal A, HER2, Normal, Luminal B, Basal.
subtype_names = c("Luminal A", "HER2","Normal", "Luminal B","Basal" )
gt_create_multi(clinical_subtype_results,subtype_names)%>% tab_source_note("Table 5: Average performance from K-fold CV on clinical models by subtype")| Step Log | Full Log | Rf | Knn | Xgb | |
|---|---|---|---|---|---|
| Luminal A | |||||
| Accuracy 1 | 0.88262411 | 0.86899476 | 0.897826087 | 0.869071847 | 0.883379587 |
| Kappa 1 | 0.49849449 | 0.41998407 | 0.533687654 | 0.319000179 | 0.455226991 |
| Balanced Accuracy 1 | 0.73547897 | 0.69515981 | 0.739636206 | 0.633556601 | 0.704499966 |
| HER2 | |||||
| Accuracy 2 | 0.75833333 | 0.76342593 | 0.638425926 | 0.717592593 | 0.652777778 |
| Kappa 2 | 0.42306744 | 0.44821798 | 0.000000000 | 0.327580981 | 0.161883926 |
| Balanced Accuracy 2 | 0.72111111 | 0.71912698 | 0.500000000 | 0.665912698 | 0.586349206 |
| Normal | |||||
| Accuracy 3 | 0.77300195 | 0.76939571 | 0.857309942 | 0.835964912 | 0.826705653 |
| Kappa 3 | 0.05230267 | 0.09160788 | 0.000000000 | -0.025406898 | 0.257672458 |
| Balanced Accuracy 3 | 0.54889288 | 0.56052346 | 0.500000000 | 0.489057734 | 0.645372839 |
| Luminal B | |||||
| Accuracy 4 | 0.67179803 | 0.67668309 | 0.633990148 | 0.691954023 | 0.632512315 |
| Kappa 4 | 0.11504906 | 0.12022009 | 0.088774259 | 0.152119709 | 0.013119336 |
| Balanced Accuracy 4 | 0.55782646 | 0.56243398 | 0.547232300 | 0.568801098 | 0.505285780 |
| Basal | |||||
| Accuracy 5 | 0.60232843 | 0.58431373 | 0.605514706 | 0.622549020 | 0.577696078 |
| Kappa 5 | 0.05089953 | 0.07226305 | -0.002943813 | -0.004598079 | 0.002529486 |
| Balanced Accuracy 5 | 0.52661612 | 0.54252225 | 0.488842130 | 0.500277454 | 0.504223415 |
| Table 5: Average performance from K-fold CV on clinical models by subtype | |||||
# Subtype-stratified CV on the gene-expression data (cached). The subtype
# label is joined from the clinical data, used only for stratification,
# then dropped before modelling.
# ge_data_subtypes = inner_join(ge_data, clinical_data) %>% select(ge_data %>% colnames, event, subtype) %>% tidyr::drop_na(event)
#
#
# lum_a = ge_data_subtypes %>% filter(subtype =="Luminal A" ) %>% select(-subtype)
# lum_b = ge_data_subtypes %>% filter(subtype == "Luminal B") %>% select(-subtype)
# normal= ge_data_subtypes %>% filter(subtype == "Normal") %>% select(-subtype)
# basal = ge_data_subtypes %>% filter(subtype == "Basal") %>% select(-subtype)
# her2 = ge_data_subtypes %>% filter(subtype == "HER2") %>% select(-subtype)
#
# fs = TRUE: limma selects the top 10 genes inside each CV fold.
# lum_a_results = model_cv_feature(lum_a, k=10, ncv = 3, top = 10,fs = TRUE, categorical_exist = FALSE, seed =2020)
# lum_b_results = model_cv_feature(lum_b, k=10, ncv = 3, top = 10,fs = TRUE, categorical_exist = FALSE, seed =2020)
# normal_results = model_cv_feature(normal, k=10, ncv = 3, top = 10,fs = TRUE, categorical_exist = FALSE, seed =2020)
# basal_results = model_cv_feature(basal, k=10, ncv = 3, top = 10,fs = TRUE, categorical_exist = FALSE, seed =2020)
# her2_results = model_cv_feature(her2, k=10, ncv = 3, top = 10,fs = TRUE, categorical_exist = FALSE, seed =2020)
#
#
# List order (Luminal A, HER2, Normal, Luminal B, Basal) must match
# subtype_names defined above.
# ge_subtype_results = list(lum_a_results,her2_results,normal_results,lum_b_results,basal_results)
#
# (A paragraph of report prose was fused onto the save() comment below by
# the report flattening.)
# save(ge_subtype_results, file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/ge_subtype_results.Rdata")Models built on gene expression data seem to perform significantly worse relative to the models built on clinical model with respect to all breast cancer subtypes in terms of both accuracy and kappa.
load("/dskh/nobackup/biostat/datasets/spatial/metabric/idr/ge_subtype_results.Rdata")
gt_create_multi(ge_subtype_results,subtype_names)%>% tab_source_note("Table 6: Average performance from K-fold CV on gene expression models by subtype")| Step Log | Full Log | Rf | Knn | Xgb | |
|---|---|---|---|---|---|
| Luminal A | |||||
| Accuracy 1 | 0.755442492 | 0.74909035 | 0.75197348 | 0.824529756 | 0.76193340 |
| Kappa 1 | -0.025814894 | -0.03244526 | -0.01706039 | -0.004741813 | -0.00215457 |
| Balanced Accuracy 1 | 0.487408949 | 0.48258961 | 0.49067946 | 0.499587576 | 0.50203611 |
| HER2 | |||||
| Accuracy 2 | 0.441203704 | 0.44953704 | 0.63935185 | 0.561574074 | 0.46712963 |
| Kappa 2 | -0.128226951 | -0.12185200 | 0.00000000 | -0.004113541 | -0.10820846 |
| Balanced Accuracy 2 | 0.426130952 | 0.44257937 | 0.50000000 | 0.489107143 | 0.43728175 |
| Normal | |||||
| Accuracy 3 | 0.784307992 | 0.79463938 | 0.73567251 | 0.844541910 | 0.77134503 |
| Kappa 3 | 0.025055040 | 0.03955725 | 0.01458779 | -0.014661693 | -0.01187107 |
| Balanced Accuracy 3 | 0.524375209 | 0.52273483 | 0.53113523 | 0.492946623 | 0.48636255 |
| Luminal B | |||||
| Accuracy 4 | 0.594293924 | 0.58842365 | 0.61013957 | 0.634729064 | 0.61486043 |
| Kappa 4 | -0.003783958 | -0.01068196 | 0.05519265 | -0.023795580 | 0.03801938 |
| Balanced Accuracy 4 | 0.500503317 | 0.49814335 | 0.52286452 | 0.491740534 | 0.51961060 |
| Basal | |||||
| Accuracy 5 | 0.532598039 | 0.55735294 | 0.49215686 | 0.605514706 | 0.52034314 |
| Kappa 5 | -0.035587857 | 0.01777176 | -0.03346215 | 0.044788603 | -0.03788754 |
| Balanced Accuracy 5 | 0.481862674 | 0.51388792 | 0.48053026 | 0.526561124 | 0.47748275 |
| Table 6: Average performance from K-fold CV on gene expression models by subtype | |||||
Models using SC proportions data don’t fare any better, with similar or even worse performance than the models built on gene expression data.
# Subtype-stratified CV on the single-cell proportion data (cached).
# sc_props_subtypes = inner_join(sc_props_sliced, clinical_data) %>% select(sc_props_sliced %>% colnames(), subtype,event) %>% na.omit()
#
#
# lum_a = sc_props_subtypes %>% filter(subtype =="Luminal A" ) %>% select(-subtype)
# lum_b = sc_props_subtypes %>% filter(subtype == "Luminal B") %>% select(-subtype)
# normal= sc_props_subtypes%>% filter(subtype == "Normal") %>% select(-subtype)
# basal = sc_props_subtypes %>% filter(subtype == "Basal") %>% select(-subtype)
# her2 = sc_props_subtypes %>% filter(subtype == "HER2") %>% select(-subtype)
#
#
# lum_a_results = model_cv_feature(lum_a, k=10, ncv = 3, top = 10,fs = FALSE, categorical_exist = FALSE, seed =2020)
# lum_b_results = model_cv_feature(lum_b, k=10, ncv = 3, top = 10,fs = FALSE, categorical_exist = FALSE, seed =2020)
# normal_results = model_cv_feature(normal, k=10, ncv = 3, top = 10,fs = FALSE, categorical_exist = FALSE, seed =2020)
# basal_results = model_cv_feature(basal, k=10, ncv = 3, top = 10,fs = FALSE, categorical_exist = FALSE, seed =2020)
# her2_results = model_cv_feature(her2, k=10, ncv = 3, top = 10,fs = FALSE, categorical_exist = FALSE, seed =2020)
#
#
#
# List order (Luminal A, HER2, Normal, Luminal B, Basal) must match
# subtype_names defined above.
# sc_props_subtypes_results = list(lum_a_results,her2_results,normal_results,lum_b_results,basal_results)
#
# save(sc_props_subtypes_results, file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/sc_props_subtypes_results.Rdata")
# FIX: the load() call had been fused onto the commented save() line by the
# report flattening, leaving it commented out although gt_create_multi()
# below needs sc_props_subtypes_results.
load("/dskh/nobackup/biostat/datasets/spatial/metabric/idr/sc_props_subtypes_results.Rdata")
gt_create_multi(sc_props_subtypes_results, subtype_names )%>% tab_source_note("Table 7: Average performance from K-fold CV on single cell proportion models by subtype")| Step Log | Full Log | Rf | Knn | Xgb | |
|---|---|---|---|---|---|
| Luminal A | |||||
| Accuracy 1 | 0.77485353 | 0.752605612 | 0.797748998 | 0.841628122 | 0.80063213 |
| Kappa 1 | 0.02724745 | 0.013390670 | 0.026564229 | -0.001629895 | 0.07081405 |
| Balanced Accuracy 1 | 0.50931098 | 0.501039432 | 0.509241412 | 0.499662472 | 0.52941986 |
| HER2 | |||||
| Accuracy 2 | 0.48240741 | 0.504166667 | 0.639351852 | 0.489814815 | 0.51296296 |
| Kappa 2 | -0.04946971 | -0.010548822 | 0.000000000 | -0.066625696 | -0.01561294 |
| Balanced Accuracy 2 | 0.47882937 | 0.501428571 | 0.500000000 | 0.472996032 | 0.50444444 |
| Normal | |||||
| Accuracy 3 | 0.66325536 | 0.610526316 | 0.798927875 | 0.853411306 | 0.71647173 |
| Kappa 3 | -0.06122751 | -0.001782345 | -0.020367340 | -0.003703704 | -0.03177534 |
| Balanced Accuracy 3 | 0.45756147 | 0.502827965 | 0.475000000 | 0.498148148 | 0.48977669 |
| Luminal B | |||||
| Accuracy 4 | 0.54470443 | 0.483661741 | 0.560426929 | 0.540599343 | 0.61941708 |
| Kappa 4 | -0.08567397 | -0.121327332 | -0.009901023 | 0.019272748 | 0.03903583 |
| Balanced Accuracy 4 | 0.45350272 | 0.427056378 | 0.495405863 | 0.516904577 | 0.52169831 |
| Basal | |||||
| Accuracy 5 | 0.46740196 | 0.467156863 | 0.547303922 | 0.582352941 | 0.58933824 |
| Kappa 5 | -0.09826132 | -0.063326581 | 0.004035498 | -0.030104872 | 0.07118454 |
| Balanced Accuracy 5 | 0.44602351 | 0.463525225 | 0.504479779 | 0.489876281 | 0.53754431 |
| Table 7: Average performance from K-fold CV on single cell proportion models by subtype | |||||
Models using SC spatial interactions have the worst performance for all subtypes when compared to the models built on other data.
# Subtype-stratified CV on the spatial-interaction data (cached).
# sc_interactions_subtypes = sc_spatial_interactions %>% inner_join(clinical_data) %>% select(subtype,(sc_spatial_interactions %>% colnames()))
#
# lum_a = sc_interactions_subtypes %>% filter(subtype =="Luminal A" ) %>% select(-subtype)
# lum_b = sc_interactions_subtypes %>% filter(subtype == "Luminal B") %>% select(-subtype)
# normal= sc_interactions_subtypes %>% filter(subtype == "Normal") %>% select(-subtype)
# basal = sc_interactions_subtypes %>% filter(subtype == "Basal") %>% select(-subtype)
# her2 = sc_interactions_subtypes %>% filter(subtype == "HER2") %>% select(-subtype)
#
#
# NOTE(review): her2 uses ncv = 4 while the other subtypes use ncv = 3 —
# presumably to accommodate the HER2 class sizes, but worth confirming.
# lum_a_results = model_cv_feature(lum_a, k=10, ncv = 3, top = 20,fs = TRUE, categorical_exist = FALSE, seed =2020)
# lum_b_results = model_cv_feature(lum_b, k=10, ncv = 3, top = 20,fs = TRUE, categorical_exist = FALSE, seed =2020)
# normal_results = model_cv_feature(normal, k=10, ncv = 3, top = 20,fs = TRUE, categorical_exist = FALSE, seed =2020)
# basal_results = model_cv_feature(basal, k=10, ncv = 3, top = 20,fs = TRUE, categorical_exist = FALSE, seed =2020)
# her2_results = model_cv_feature(her2, k=10, ncv = 4, top = 20,fs = TRUE, categorical_exist = FALSE, seed =2020)
#
#
#
# List order (Luminal A, HER2, Normal, Luminal B, Basal) must match
# subtype_names defined above.
# sc_interactions_subtypes_results = list(lum_a_results,her2_results,normal_results,lum_b_results,basal_results)
#
# save(sc_interactions_subtypes_results, file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/sc_interactions_subtypes_results .Rdata")
# FIX: the load() call had been fused onto the commented save() line by the
# report flattening, leaving it commented out although gt_create_multi()
# below needs sc_interactions_subtypes_results. The stray space in
# "...results .Rdata" is preserved because the cached file was saved under
# that exact name.
load("/dskh/nobackup/biostat/datasets/spatial/metabric/idr/sc_interactions_subtypes_results .Rdata")
gt_create_multi(sc_interactions_subtypes_results, subtype_names)%>% tab_source_note("Table 8: Average performance from K-fold CV on single cell spatial interaction models by subtype")| Step Log | Full Log | Rf | Knn | Xgb | |
|---|---|---|---|---|---|
| Luminal A | |||||
| Accuracy 1 | 0.75248227 | 0.71079248 | 0.77693494 | 0.806490903 | 0.778430466 |
| Kappa 1 | -0.04546962 | -0.04481927 | -0.03810129 | 0.004059123 | -0.004917488 |
| Balanced Accuracy 1 | 0.48150955 | 0.47760170 | 0.48084908 | 0.502043005 | 0.494345120 |
| HER2 | |||||
| Accuracy 2 | 0.50416667 | 0.52023810 | 0.64047619 | 0.467261905 | 0.510714286 |
| Kappa 2 | -0.07729603 | -0.03797445 | 0.00000000 | -0.132189795 | 0.011188734 |
| Balanced Accuracy 2 | 0.44965278 | 0.47256944 | 0.50000000 | 0.414027778 | 0.488333333 |
| Normal | |||||
| Accuracy 3 | 0.68918129 | 0.63382066 | 0.74366472 | 0.828460039 | 0.737719298 |
| Kappa 3 | -0.07219146 | -0.08426513 | -0.03463858 | 0.029651619 | -0.062002002 |
| Balanced Accuracy 3 | 0.44855567 | 0.41969324 | 0.45942810 | 0.525422113 | 0.459845938 |
| Luminal B | |||||
| Accuracy 4 | 0.60451560 | 0.57446634 | 0.63435961 | 0.688218391 | 0.648440066 |
| Kappa 4 | 0.04894559 | 0.02092906 | 0.08747965 | 0.105454724 | 0.065006259 |
| Balanced Accuracy 4 | 0.52785409 | 0.51363518 | 0.54358859 | 0.548560666 | 0.535796959 |
| Basal | |||||
| Accuracy 5 | 0.46997549 | 0.48823529 | 0.48455882 | 0.563480392 | 0.532352941 |
| Kappa 5 | -0.09054602 | -0.07119947 | -0.08202388 | -0.048015655 | -0.071102142 |
| Balanced Accuracy 5 | 0.44507298 | 0.45500768 | 0.45474262 | 0.473087329 | 0.464682956 |
| Table 8: Average performance from K-fold CV on single cell spatial interaction models by subtype | |||||
When predicting the survival outcome in patients diagnosed with a particular subtype of breast cancer, it seems as though the clinical model has the most predictive power due to age and certain treatment variables consistently retained in the variable inclusion plot when minimising LOSS. The latter implies that some treatments are likely to play a pivotal role in a patient’s survival as they provide a differentiating factor that improves the model’s performance. The exception to this is individuals of a Normal (age doesn’t seem as crucial) and Basal breast cancer subtype (no variables seem crucial).
In the variable inclusion plot the value of the penalty coefficient is varied and we observe whether each parameter is selected in the model that minimises the loss plus penalty in weighted bootstrap samples. This is done to see the effect of slight deviations in our samples in the selection of various parameters. If stable, the probability a parameter is selected should be relatively consistent across penalty coefficients. Additionally, if certain treatments appear over others consistently, this can give a level of indication as to whether certain treatment(s) are a differentiating factor in patient survival prognosis and thus also if the treatment is effective.
From the variable inclusion plot (VIP), on the aggregate clinical data, there is only one variable that is selected consistently in the best performing models as the penalty term increases on the x-axis. This variable is age. Intuitively, age would play a major role in the survival of a patient as individuals are more susceptible to death and less likely to respond to treatments as they age.
It is interesting that no treatment tends to be selected with high probability as the penalty term increases. This, is likely due to different treatments being effective on different subtypes of patients, acting as a confounding variable. Thus, this would suggest that our logistic model’s needs to account for the interaction between patient subtypes and the treatment administered. This will implicitly be investigated in the following sections as we investigate model stability on models built from data stratified by subtype.
# Variable-inclusion-plot stability analysis on the full clinical data
# (cached). The logistic fit is converted into its IRLS weighted
# least-squares form — working response z and weights v — so that vis()
# can bootstrap a linear model that approximates the logistic one.
# bw.glm <- glm(event ~ ., family = binomial, data =clinical_data_processed %>% select(-sample_id))
# pihat <- fitted(bw.glm)
# r <- residuals(bw.glm, type = "working")
# z <- log(pihat / (1 - pihat)) + r
# v <- pihat * (1 - pihat)
# nbwt <- clinical_data_processed%>% select(-sample_id, -event)
# nbwt$z <- z
# nbwt$low <- NULL
# bw.lm <- lm(z ~ ., data = nbwt, weights = v)
# B = 150 weighted-bootstrap replicates; seed fixed for reproducibility.
# NOTE(review): vis() is presumably mplot::vis — confirm the library calls
# earlier in the file.
# cl.bw.lm.vis <- vis(bw.lm, B = 150, seed = 1)
# save(cl.bw.lm.vis, file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/cl.bw.lm.vis.Rdata")
load("/dskh/nobackup/biostat/datasets/spatial/metabric/idr/cl.bw.lm.vis.Rdata")
(plot(cl.bw.lm.vis, which = "vip", interactive = FALSE)) %>% ggplotly()Figure 9: Variable Inclusion plot on models built on the entire clinical dataset
As per the model built on the aggregate data, age continues to be integral to the stability of a model when predicting outcomes in Luminal A breast cancer patients. This highlights that age likely plays a major role in the survival of a patient. The treatment CT.HT.RT is also another variable that is selected with a comparable level of probability to age. This agrees with what was previously hypothesised. That is, when investigating particular subtypes, certain treatments are more likely to play a crucial role in differentiating whether a patient survives or not. Since models predicting the outcome of individuals of this subtype perform relatively well, it seems as though CT.HT.RT should continue to be administered to Luminal A patients as a way of combating their breast cancer. To improve the outreach of this treatment, government and specialists should work closely with each other as a means to lower the cost of providing this treatment and improving accessibility.
# Same working-response construction as Figure 9, restricted to the
# Luminal A patients (cl_st[[1]]); results cached to disk.
# bw.glm <- glm(event ~ ., family = binomial, data =cl_st[[1]]%>% select(-sample_id))
# pihat <- fitted(bw.glm)
# r <- residuals(bw.glm, type = "working")
# z <- log(pihat / (1 - pihat)) + r
# v <- pihat * (1 - pihat)
# nbwt <- cl_st[[1]] %>% select(-sample_id, -event)
# nbwt$z <- z
# nbwt$low <- NULL
# bw.lm <- lm(z ~ ., data = nbwt, weights = v)
# luma.bw.lm.vis <- vis(bw.lm, B = 150, seed = 1)
#
# save(luma.bw.lm.vis, file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/luma.bw.lm.vis.Rdata")
load("/dskh/nobackup/biostat/datasets/spatial/metabric/idr/luma.bw.lm.vis.Rdata")
(plot(luma.bw.lm.vis, which = "vip", interactive = FALSE)) %>% ggplotly()Figure 10: Variable Inclusion plot on models built on the Luminal A subtype clinical dataset
For HER2 patients, all variables except RV and treatmentRT are selected with high probability throughout. This suggests that multiple treatments and clinical factors play a major role in determining whether a patient of this subtype survives. However, since being administered an RT treatment doesn’t seem to be a differentiating factor in determining the survival of a patient (as it is rarely included in any of the best performing models that minimise LOSS), this indicates that individuals and breast cancer specialists should avoid this type of treatment for patients with a HER2 subtype of cancer.
# Variable-inclusion-plot (VIP) analysis for the HER2 (cl_st[[5]]) clinical model.
# The logistic fit is linearised through its IRLS working response
# (z = linear predictor + working residuals, weights v = pihat * (1 - pihat))
# so that mplot's vis() bootstrap can be run on an lm surrogate.
# The precomputed vis object is re-loaded from disk below.
# bw.glm <- glm(event ~ ., family = binomial, data =cl_st[[5]]%>% select(-sample_id))
# pihat <- fitted(bw.glm)
# r <- residuals(bw.glm, type = "working")
# z <- log(pihat / (1 - pihat)) + r
# v <- pihat * (1 - pihat)
# nbwt <- cl_st[[5]] %>% select(-sample_id, -event)
# nbwt$z <- z
# nbwt$low <- NULL
# bw.lm <- lm(z ~ ., data = nbwt, weights = v)
# her2.bw.lm.vis <- vis(bw.lm, B = 150, seed = 1)
#
# save(her2.bw.lm.vis, file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/her2.bw.lm.vis.Rdata")
load("/dskh/nobackup/biostat/datasets/spatial/metabric/idr/her2.bw.lm.vis.Rdata")
(plot(her2.bw.lm.vis, which = "vip", interactive = FALSE)) %>% ggplotly()
Figure 11: Variable Inclusion plot on models built on the HER2 subtype clinical dataset
For Normal breast cancer patients, surprisingly age doesn’t seem to be integral to the stability of a model that predicts the survival outcome in these patients. This is because the probability that age is selected follows a similar path to the redundant variable (RV), which is unrelated to an individual’s survival. Instead, being treated with CT.HT.RT and the size of a patient’s tumour seem to govern the survival of patients of a normal subtype of cancer. However, these results may need to be taken with caution as there was only a fair agreement (kappa = 0.23) between the clinical model’s prediction and actual outcome when accounting for the imbalance of classes when K-fold CV was performed.
# Variable-inclusion-plot (VIP) analysis for the Normal (cl_st[[3]]) clinical model.
# The logistic fit is linearised through its IRLS working response
# (z = linear predictor + working residuals, weights v = pihat * (1 - pihat))
# so that mplot's vis() bootstrap can be run on an lm surrogate.
# The precomputed vis object is re-loaded from disk below.
# bw.glm <- glm(event ~ ., family = binomial, data =cl_st[[3]]%>% select(-sample_id))
# pihat <- fitted(bw.glm)
# r <- residuals(bw.glm, type = "working")
# z <- log(pihat / (1 - pihat)) + r
# v <- pihat * (1 - pihat)
# nbwt <- cl_st[[3]] %>% select(-sample_id, -event)
# nbwt$z <- z
# nbwt$low <- NULL
# bw.lm <- lm(z ~ ., data = nbwt, weights = v)
# normal.bw.lm.vis <- vis(bw.lm, B = 150, seed = 1)
#
# save(normal.bw.lm.vis, file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/normal.bw.lm.vis.Rdata")
load("/dskh/nobackup/biostat/datasets/spatial/metabric/idr/normal.bw.lm.vis.Rdata")
(plot(normal.bw.lm.vis, which = "vip", interactive = FALSE)) %>% ggplotly()
Figure 12: Variable Inclusion plot on models built on the Normal subtype clinical dataset
As per the Luminal A and HER2 subtype, age is incorporated in stable models with a relatively high level of probability when compared to other variables throughout the range of penalty values. treatment.HT seems to be of a similar level of importance until a penalty coefficient of 5, where its probability of being selected then sharply declines. This is in contrast to age which only declines slowly. This suggests that the treatment HT may only have a minor role in differentiating whether a Luminal B patient survives. Thus, this indicates that additional research into more effective treatments must be conducted to improve patient outcomes and prognosis of patients of a Luminal B breast cancer subtype (accentuated by models predicting outcomes in these patients only performing “fair” in terms of a kappa statistic).
# Variable-inclusion-plot (VIP) analysis for the Luminal B (cl_st[[2]]) clinical model.
# The logistic fit is linearised through its IRLS working response
# (z = linear predictor + working residuals, weights v = pihat * (1 - pihat))
# so that mplot's vis() bootstrap can be run on an lm surrogate.
# The precomputed vis object is re-loaded from disk below.
# bw.glm <- glm(event ~ ., family = binomial, data =cl_st[[2]]%>% select(-sample_id))
# pihat <- fitted(bw.glm)
# r <- residuals(bw.glm, type = "working")
# z <- log(pihat / (1 - pihat)) + r
# v <- pihat * (1 - pihat)
# nbwt <- cl_st[[2]] %>% select(-sample_id, -event)
# nbwt$z <- z
# nbwt$low <- NULL
# bw.lm <- lm(z ~ ., data = nbwt, weights = v)
# lumb.bw.lm.vis <- vis(bw.lm, B = 150, seed = 1)
#
# save(lumb.bw.lm.vis, file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/lumb.bw.lm.vis.Rdata")
load("/dskh/nobackup/biostat/datasets/spatial/metabric/idr/lumb.bw.lm.vis.Rdata")
(plot(lumb.bw.lm.vis, which = "vip", interactive = FALSE)) %>% ggplotly()
Figure 13: Variable Inclusion plot on models built on the Luminal B subtype clinical dataset
RV is a redundant variable generated by the mplot package that is uncorrelated to the patient’s survival status. Thus, any values near or below this variable in the variable inclusion plot (VIP) are unlikely to be incorporated into a stable model that is able to predict the patient’s survival outcome. From the VIP below, we see that all the variables are below RV for the Basal subtype. This indicates that all variables are more or less redundant when predicting survival outcome in patients with Basal breast cancer. This is consistent with the results in previous sections, where all models performed poorly with regards to a kappa score when predicting this subtype. This suggests that the treatments used for patients of this subtype don’t necessarily differentiate whether they survive or not. Thus, further research needs to be done in relation to the factors that influence a Basal individual’s survival and the treatments that prove effective.
# Variable-inclusion-plot (VIP) analysis for the Basal (cl_st[[4]]) clinical model.
# The logistic fit is linearised through its IRLS working response
# (z = linear predictor + working residuals, weights v = pihat * (1 - pihat))
# so that mplot's vis() bootstrap can be run on an lm surrogate.
# The precomputed vis object is re-loaded from disk below.
# bw.glm <- glm(event ~ ., family = binomial, data =cl_st[[4]]%>% select(-sample_id))
# pihat <- fitted(bw.glm)
# r <- residuals(bw.glm, type = "working")
# z <- log(pihat / (1 - pihat)) + r
# v <- pihat * (1 - pihat)
# nbwt <- cl_st[[4]] %>% select(-sample_id, -event)
# nbwt$z <- z
# nbwt$low <- NULL
# bw.lm <- lm(z ~ ., data = nbwt, weights = v)
# basal.bw.lm.vis <- vis(bw.lm, B = 150, seed = 1)
#
# save(basal.bw.lm.vis, file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/basal.bw.lm.vis.Rdata")
load("/dskh/nobackup/biostat/datasets/spatial/metabric/idr/basal.bw.lm.vis.Rdata")
(plot(basal.bw.lm.vis, which = "vip", interactive = FALSE)) %>% ggplotly()
Figure 14: Variable Inclusion plot on models built on the Basal subtype clinical dataset
I restricted my attention to just models built on the clinical dataset as they had the best prospective performance.
I used weights in all the classification models (with the exception of KNN, which does not accept weights) to deal with the class imbalance. These weights increase the algorithm’s priority in correctly predicting the outcome of the survivors in the dataset (as these individuals are the minority class).
The accuracy of the best performing weighted model (XGB) was lower than that of the best performing un-weighted model (stepwise logistic) on the aggregated data. However, the kappa and balanced accuracy from the weighted logistic regression model were slightly better than for the un-weighted models.
Thus, if our main goal is to have a balance between correctly predicting survivors and non-survivors, this weighted model should be used instead on the aggregate clinical data. This will to some extent mitigate the logistic regression model’s overfitting to the proportion of classes in a dataset.
# K-fold CV on the aggregate clinical data using class-weighted models (w = TRUE).
# The cross-validation is expensive, so results were computed once, saved,
# and are re-loaded from disk here.
# w_clinical_results = model_cv_feature(clinical_data_processed, k=10, ncv = 5, top = 10,fs = FALSE, categorical_exist = TRUE, seed =2020, w = TRUE)
#
# save(w_clinical_results,file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/w_clinical_results.Rdata")
# FIX: the load() call had been fused onto the end of the commented save()
# line, so it was itself commented out and w_clinical_results was never
# loaded before being used by gt_create() below.
load("/dskh/nobackup/biostat/datasets/spatial/metabric/idr/w_clinical_results.Rdata")
gt_create(w_clinical_results) %>% tab_source_note("Table 9: K-fold CV clinical model performance with weighted classification models")
| | Step Log | Full Log | Rf | Knn | Xgb |
|---|---|---|---|---|---|
| Accuracy | 0.656338 | 0.6828169 | 0.6461972 | 0.7692958 | 0.7774648 |
| Kappa | 0.000000 | 0.2681863 | 0.2108437 | 0.1925367 | 0.0000000 |
| Balanced Accuracy | 0.500000 | 0.6685242 | 0.6371198 | 0.5809230 | 0.5000000 |
| Table 9: K-fold CV clinical model performance with weighted classification models | |||||
Using weights in the models built per subtype yielded no improvement to accuracy, balanced accuracy and kappa for any of the subtypes.
Thus, other techniques such as hyperparameter optimisation may need to be turned to in order to improve model performance.
# K-fold CV per clinical subtype using class-weighted models (w = TRUE).
# The five subtype results are collected into a list in the order
# Luminal A, HER2, Normal, Luminal B, Basal. The runs are expensive, so the
# list was computed once, saved, and is re-loaded from disk here.
# w_lum_a_results = model_cv_feature(lum_a, k=10, ncv = 3, top = 10,fs = FALSE, categorical_exist = TRUE, seed =2020, w = TRUE)
# w_lum_b_results = model_cv_feature(lum_b, k=10, ncv = 3, top = 10,fs = FALSE, categorical_exist = TRUE, seed =2020, w = TRUE)
# w_normal_results = model_cv_feature(normal, k=10, ncv = 3, top = 10,fs = FALSE, categorical_exist = TRUE, seed =2020, w = TRUE)
# w_basal_results = model_cv_feature(basal, k=10, ncv = 3, top = 10,fs = FALSE, categorical_exist = TRUE, seed =2020, w = TRUE)
# w_her2_results = model_cv_feature(her2, k=10, ncv = 3, top = 10,fs = FALSE, categorical_exist = TRUE, seed =2020, w = TRUE)
#
# w_clinical_subtype_results = list(w_lum_a_results,w_her2_results,w_normal_results,w_lum_b_results,w_basal_results)
#
# save(w_clinical_subtype_results, file = "/dskh/nobackup/biostat/datasets/spatial/metabric/idr/w_clinical_subtype_results.Rdata")
# FIX: the load() call had been fused onto the end of the commented save()
# line, so it was itself commented out and w_clinical_subtype_results was
# never loaded before being used by gt_create_multi() below.
load("/dskh/nobackup/biostat/datasets/spatial/metabric/idr/w_clinical_subtype_results.Rdata")
# Human-readable subtype labels; the order matches the element order of
# w_clinical_subtype_results (lum_a, her2, normal, lum_b, basal).
subtype_names = c("Luminal A", "HER2","Normal", "Luminal B","Basal" )
gt_create_multi(w_clinical_subtype_results,subtype_names)%>% tab_source_note("Table 10: Average performance from K-fold CV on with weighted classification models by subtype")
| | Step Log | Full Log | Rf | Knn | Xgb |
|---|---|---|---|---|---|
| Luminal A | |||||
| Accuracy 1 | 0.1800339 | 0.79562134 | 0.76108541 | 0.869071847 | 0.8489516 |
| Kappa 1 | 0.0000000 | 0.35783302 | 0.32170170 | 0.319000179 | 0.0000000 |
| Balanced Accuracy 1 | 0.5000000 | 0.72817451 | 0.72619187 | 0.633556601 | 0.5000000 |
| HER2 | |||||
| Accuracy 2 | 0.4245370 | 0.76712963 | 0.60324074 | 0.717592593 | 0.6384259 |
| Kappa 2 | 0.0000000 | 0.45371249 | 0.00000000 | 0.327580981 | 0.0000000 |
| Balanced Accuracy 2 | 0.5000000 | 0.72190476 | 0.50000000 | 0.665912698 | 0.5000000 |
| Normal | |||||
| Accuracy 3 | 0.4126706 | 0.75175439 | 0.62534113 | 0.835964912 | 0.8573099 |
| Kappa 3 | 0.0000000 | 0.12834773 | 0.13242737 | -0.025406898 | 0.0000000 |
| Balanced Accuracy 3 | 0.5000000 | 0.59458790 | 0.62597161 | 0.489057734 | 0.5000000 |
| Luminal B | |||||
| Accuracy 4 | 0.4446634 | 0.58152709 | 0.58862890 | 0.691954023 | 0.7093596 |
| Kappa 4 | 0.0000000 | 0.11513964 | 0.15278453 | 0.152119709 | 0.0000000 |
| Balanced Accuracy 4 | 0.5000000 | 0.57186145 | 0.59567870 | 0.568801098 | 0.5000000 |
| Basal | |||||
| Accuracy 5 | 0.3265931 | 0.51041667 | 0.50453431 | 0.622549020 | 0.6734069 |
| Kappa 5 | 0.0000000 | 0.05343929 | -0.02151625 | -0.004598079 | 0.0000000 |
| Balanced Accuracy 5 | 0.5000000 | 0.54038392 | 0.48993354 | 0.500277454 | 0.5000000 |
| Table 10: Average performance from K-fold CV on with weighted classification models by subtype | |||||
The best performing clinical models have impressive scores for each metric, while the models built on single-cell spatial interaction data have the worst. Thus, it seems as though using a novel dataset (clinical data) is more effective than IMC data in predicting the survival status of breast cancer patients.
# Gather the cross-validated results from every dataset, pair them with
# matching display labels, and build a summary table of the best-performing
# model per metric.
model = list(clinical_results, ge_results, sc_proportions_processed_results, sc_spatial_interactions_results)
# Dataset labels, in the same order as `model`.
ns = c("Clinical", "Gene Expression", "Single-Cell Proportions", "Single-Cell Spatial Interactions")
# Wrap the summary table in a list so it can be indexed when rendered below.
overall_summary = list(gt_best_in_data(model, ns = ns))
overall_summary[[1]] %>% tab_source_note("Table 11: Summary table of the best performing models from each dataset")
| Dataset | Score | Model |
|---|---|---|
| Accuracy | ||
| Clinical | 0.78225352 | Step Log |
| Gene Expression | 0.73295775 | Step Log |
| Single-Cell Proportions | 0.75605634 | Step Log |
| Single-Cell Spatial Interactions | 0.72422535 | Step Log |
| Kappa | ||
| Clinical | 0.23158115 | Full Log |
| Gene Expression | 0.06483614 | Step Log |
| Single-Cell Proportions | 0.09399390 | Rf |
| Single-Cell Spatial Interactions | -0.01438837 | Full Log |
| Balanced Accuracy | ||
| Clinical | 0.60075250 | Full Log |
| Gene Expression | 0.52855187 | Step Log |
| Single-Cell Proportions | 0.54413503 | Rf |
| Single-Cell Spatial Interactions | 0.49366447 | Full Log |
| Table 11: Summary table of the best performing models from each dataset | ||
Ali, H., Jackson, H., Zanotelli, V., Danenberg, E., Fischer, J., & Bardwell, H. et al. (2020). Imaging mass cytometry and multiplatform genomics define the phenogenomic landscape of breast cancer. Nature Cancer, 1(2), 163-175. doi: 10.1038/s43018-020-0026-6
C. Sievert. Interactive Web-Based Data Visualization with R, plotly, and shiny. Chapman and Hall/CRC Florida, 2020.
McHugh, M. L. (2012). Interrater reliability: the kappa statistic. Biochemia medica, 22(3), 276-282.
Nicolas Canete and Ellis Patrick (2021). spicyR: Spatial analysis of in situ cytometry data. R package version 1.3.2.
Ritchie, M.E., Phipson, B., Wu, D., Hu, Y., Law, C.W., Shi, W., and Smyth, G.K. (2015). limma powers differential expression analyses for RNA-sequencing and microarray studies. Nucleic Acids Research 43(7), e47.
Tarr G, Müller S, Welsh AH (2018). “mplot: An R Package for Graphical Model Stability and Variable Selection Procedures.” Journal of Statistical Software, 83(9), 1-28. doi: 10.18637/jss.v083.i09 (URL: https://doi.org/10.18637/jss.v083.i09).
Wickham et al., (2019). Welcome to the tidyverse. Journal of Open Source Software, 4(43), 1686, https://doi.org/10.21105/joss.01686